library(ggplot2)
library(gcookbook) # For the data set
library(plyr) ##
# List the two columns we'll use
head(heightweight[, c("ageYear", "heightIn")])
## ageYear heightIn
## 1 11.92 56.3
## 2 12.92 62.3
## 3 12.75 63.3
## 4 13.42 59.0
## 5 15.92 62.5
## 6 14.25 62.5
ggplot( heightweight, aes( x = ageYear, y = heightIn)) +
geom_point()
## To use different shapes in a scatter plot, set shape.
ggplot( heightweight, aes( x = ageYear, y = heightIn)) + geom_point( shape = 21)
## The size of the points can be controlled with size.
ggplot( heightweight, aes( x = ageYear, y = heightIn)) + geom_point( size = 1.5)
# Show the three columns we'll use
head(heightweight[, c("sex", "ageYear", "heightIn")])
## sex ageYear heightIn
## 1 f 11.92 56.3
## 2 f 12.92 62.3
## 3 f 12.75 63.3
## 4 f 13.42 59.0
## 5 f 15.92 62.5
## 6 f 14.25 62.5
ggplot( heightweight, aes( x = ageYear, y = heightIn, colour = sex)) +
geom_point()
ggplot( heightweight, aes( x = ageYear, y = heightIn, shape = sex)) +
geom_point()
## It is possible to map a variable to both shape and colour,
ggplot( heightweight, aes( x = ageYear, y = heightIn, shape = sex, colour = sex)) + geom_point()
## set different shapes and colors for the grouping variables
ggplot( heightweight, aes( x = ageYear, y = heightIn, shape = sex, colour = sex)) +
geom_point() +
scale_shape_manual( values = c(1,2)) +
scale_colour_brewer( palette ="Set1")
ggplot( heightweight, aes( x = ageYear, y = heightIn)) +
geom_point( shape = 3)
# Use slightly larger points and use a shape scale with custom values
ggplot( heightweight, aes( x = ageYear, y = heightIn, shape = sex)) +
geom_point( size = 3) +
scale_shape_manual( values = c( 1, 4))
## It’s possible to have the shape represent one variable and the fill (empty or solid) represent another variable.
# Make a copy of the data
hw <- heightweight
# Categorize into < 100 and > = 100 groups
hw$weightGroup <- cut( hw$weightLb, breaks = c(-Inf, 100, Inf),
labels = c("< 100", "> = 100"))
# Use shapes with fill and color, and use colors that are empty (NA) and # filled
ggplot( hw, aes( x = ageYear, y = heightIn, shape = sex, fill = weightGroup)) +
geom_point( size = 2.5) + scale_shape_manual( values = c( 21, 24)) +
scale_fill_manual( values = c( NA, "black"), guide = guide_legend( override.aes = list( shape = 21)))
# List the four columns we'll use
head(heightweight[, c("sex", "ageYear", "heightIn", "weightLb")])
## sex ageYear heightIn weightLb
## 1 f 11.92 56.3 85.0
## 2 f 12.92 62.3 105.0
## 3 f 12.75 63.3 108.0
## 4 f 13.42 59.0 92.0
## 5 f 15.92 62.5 112.5
## 6 f 14.25 62.5 112.0
ggplot( heightweight, aes( x = ageYear, y = heightIn, colour = weightLb)) +
geom_point()
ggplot( heightweight, aes( x = ageYear, y = heightIn, size = weightLb )) +
geom_point()
## set the fill gradient to go from black to white and make the points larger
ggplot( heightweight, aes( x = ageYear, y = heightIn, fill = weightLb)) +
geom_point( shape = 21, size = 2.5) +
scale_fill_gradient( low ="black", high ="white")
# Using guide_legend() will result in a discrete legend instead of a colorbar
ggplot( heightweight, aes( x = ageYear, y = heightIn, fill = weightLb)) +
geom_point( shape = 21, size = 2.5) +
scale_fill_gradient( low ="black", high ="white", breaks = seq( 70, 170, by = 20), guide = guide_legend())
## best
ggplot( heightweight, aes( x = ageYear, y = heightIn, size = weightLb, colour = sex)) +
geom_point( alpha =.5) +
scale_size_area() + # Make area proportional to numeric value
scale_colour_brewer( palette ="Set1")
You have many points and they obscure each other.
sp <- ggplot( diamonds, aes( x = carat, y = price))
sp +
geom_point()
## We can make the points semitransparent using alpha,
sp +
geom_point( alpha =.1)
sp +
geom_point( alpha =.01)
## Another solution is to bin the points into rectangles and map the density of the points to the fill color of the rectangles,
sp +
stat_bin2d()
sp +
stat_bin2d( bins = 50) +
scale_fill_gradient( low ="lightblue", high ="red", limits = c( 0, 6000))
Another alternative is to bin the data into hexagons instead of rectangles, with stat_binhex. It works just like stat_bin2d().
library( hexbin)
sp +
stat_binhex() +
scale_fill_gradient( low ="lightblue", high ="red", limits = c( 0, 8000))
sp +
stat_binhex() +
scale_fill_gradient( low ="lightblue", high ="red", breaks = c( 0, 250, 500, 1000, 2000, 4000, 6000),
limits = c( 0, 6000))
Overplotting can also occur when the data is discrete on one or both axes. In these cases, you can randomly jitter the points with position_jitter().
sp1 <- ggplot( ChickWeight, aes( x = Time, y = weight))
sp1 + geom_point()
sp1 + geom_point( position ="jitter")
# Could also use geom_jitter(), which is equivalent
sp1 + geom_point( position = position_jitter( width =.5, height = 0))
##
sp1 + geom_boxplot( aes( group = Time))
# The base plot
sp <- ggplot( heightweight, aes( x = ageYear, y = heightIn))
# By default, stat_smooth() also adds a 95% confidence region for the regression fit
sp +
geom_point() +
stat_smooth( method = lm)
# 99% confidence region
sp +
geom_point() +
stat_smooth( method = lm, level = 0.99)
# No confidence region
sp +
geom_point() +
stat_smooth( method = lm, se = FALSE)
# The default color of the fit line is blue. This can be change by setting colour.
sp +
geom_point( colour ="grey60") +
stat_smooth( method = lm, se = FALSE, colour ="black")
# If you add stat_smooth() without specifying the method, it will use a loess (locally weighted polynomial)
sp +
geom_point( colour ="grey60") +
stat_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
sp +
geom_point( colour ="grey60") +
stat_smooth( method = loess)
Another common type of model fit is a logistic regression.
library( MASS) # For the data set
b <- biopsy
b$classn[ b$class =="benign"] <- 0
b$classn[ b$class =="malignant"] <- 1
head(b)
## ID V1 V2 V3 V4 V5 V6 V7 V8 V9 class classn
## 1 1000025 5 1 1 1 2 1 3 1 1 benign 0
## 2 1002945 5 4 4 5 7 10 3 2 1 benign 0
## 3 1015425 3 1 1 1 2 2 3 1 1 benign 0
## 4 1016277 6 8 8 1 3 4 3 7 1 benign 0
## 5 1017023 4 1 1 3 2 1 3 1 1 benign 0
## 6 1017122 8 10 10 8 7 10 9 7 1 malignant 1
# we’ll just look at the relationship of V1 (clump thickness) and the class of the tumor.
ggplot( b, aes( x = V1, y = classn)) +
geom_point( position = position_jitter( width = 0.3, height = 0.06), alpha = 0.4, shape = 21, size = 1.5) +
stat_smooth( method = glm, family = binomial)
# If your scatter plot has points grouped by a factor, using colour or shape, one fit line will be drawn for each group.
sps <- ggplot( heightweight, aes( x = ageYear, y = heightIn, colour = sex)) +
geom_point() +
scale_colour_brewer( palette ="Set1")
sps + geom_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
# If you want the lines to extrapolate from the data
sps +
geom_smooth( method = lm, se = FALSE, fullrange = TRUE)
You have already created a fitted regression model object for a data set, and you want to plot the lines for that model.
model <- lm( heightIn ~ ageYear + I(ageYear^2), heightweight)
model
##
## Call:
## lm(formula = heightIn ~ ageYear + I(ageYear^2), data = heightweight)
##
## Coefficients:
## (Intercept) ageYear I(ageYear^2)
## -10.3136 8.6673 -0.2478
# Create a data frame with ageYear column, interpolating across range
xmin <- min( heightweight$ageYear)
xmax <- max( heightweight$ageYear)
predicted <- data.frame( ageYear = seq( xmin, xmax, length.out = 100))
predicted$heightIn <- predict( model, predicted)
head(predicted)
## ageYear heightIn
## 1 11.58000 56.82624
## 2 11.63980 57.00047
## 3 11.69960 57.17294
## 4 11.75939 57.34363
## 5 11.81919 57.51255
## 6 11.87899 57.67969
# We can now plot the data points along with the values predicted from the model
sp <- ggplot( heightweight, aes( x = ageYear, y = heightIn)) +
geom_point( colour ="grey40")
sp + geom_line( data = predicted, size = 1)
# Given a model, predict values of yvar from xvar
# This supports one predictor and one predicted variable
# xrange: If NULL, determine the x range from the model object. If a vector with
# two numbers, use those as the min and max of the prediction range.
# samples: Number of samples across the x range.
# ...: Further arguments to be passed to predict()
predictvals <- function( model, xvar, yvar, xrange = NULL, samples = 100, ...) {
# If xrange isn't passed in, determine xrange from the models.
# Different ways of extracting the x range, depending on model type
if (is.null(xrange)) {
if (any( class( model) %in% c("lm", "glm")))
xrange <- range( model $ model[[ xvar]])
else if (any( class( model) %in% "loess")) xrange <- range( model $ x)
}
newdata <- data.frame( x = seq( xrange[1], xrange[2], length.out = samples))
names( newdata) <- xvar
newdata[[ yvar]] <- predict( model, newdata = newdata, ...)
newdata
}
modlinear <- lm( heightIn ~ ageYear, heightweight)
modloess <- loess( heightIn ~ ageYear, heightweight)
lm_predicted <- predictvals( modlinear, "ageYear", "heightIn")
loess_predicted <- predictvals( modloess, "ageYear", "heightIn")
sp +
geom_line( data = lm_predicted, colour ="red", size =.8) +
geom_line( data = loess_predicted, colour ="blue", size =.8)
For glm models that use a nonlinear link function, you need to specify type =" response"
library( MASS) # For the data set
b <- biopsy
b$classn[ b$class =="benign"] <- 0
b$classn[ b$class =="malignant"] <- 1
head(b)
## ID V1 V2 V3 V4 V5 V6 V7 V8 V9 class classn
## 1 1000025 5 1 1 1 2 1 3 1 1 benign 0
## 2 1002945 5 4 4 5 7 10 3 2 1 benign 0
## 3 1015425 3 1 1 1 2 2 3 1 1 benign 0
## 4 1016277 6 8 8 1 3 4 3 7 1 benign 0
## 5 1017023 4 1 1 3 2 1 3 1 1 benign 0
## 6 1017122 8 10 10 8 7 10 9 7 1 malignant 1
# glm model
fitlogistic <- glm( classn ~ V1, b, family = binomial)
# Get predicted values
glm_predicted <- predictvals( fitlogistic, "V1", "classn", type ="response")
ggplot( b, aes( x = V1, y = classn)) +
geom_point( position = position_jitter( width =.3, height =.08), alpha = 0.4, shape = 21, size = 1.5) +
geom_line( data = glm_predicted, colour ="#1177FF", size = 1)
With the heightweight data set, we’ll make a linear model with lm() for each of the levels of sex, and put those model objects in a list. The model building is done with a function, make_model()
make_model <- function( data) {
lm( heightIn ~ ageYear, data)
}
models <- dlply( heightweight, "sex", .fun = make_model) # Print out the list of two lm objects, f and m models
models
## $f
##
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
##
## Coefficients:
## (Intercept) ageYear
## 43.963 1.209
##
##
## $m
##
## Call:
## lm(formula = heightIn ~ ageYear, data = data)
##
## Coefficients:
## (Intercept) ageYear
## 30.658 2.301
##
##
## attr(,"split_type")
## [1] "data.frame"
## attr(,"split_labels")
## sex
## 1 f
## 2 m
# Now that we have the list of model objects, we can run predictvals() to get predicted values from each model, using the ldply() function
predvals <- ldply( models, .fun = predictvals, xvar ="ageYear", yvar ="heightIn")
head(predvals)
## sex ageYear heightIn
## 1 f 11.58000 57.96250
## 2 f 11.63980 58.03478
## 3 f 11.69960 58.10707
## 4 f 11.75939 58.17936
## 5 f 11.81919 58.25165
## 6 f 11.87899 58.32394
# Finally, we can plot the data with the predicted values
ggplot( heightweight, aes( x = ageYear, y = heightIn, colour = sex)) +
geom_point() +
geom_line( data = predvals)
predvals <- ldply( models, .fun = predictvals, xvar ="ageYear", yvar ="heightIn",
xrange = range( heightweight$ageYear))
ggplot( heightweight, aes( x = ageYear, y = heightIn, colour = sex)) +
geom_point() + geom_line( data = predvals)
model <- lm( heightIn ~ ageYear, heightweight)
summary( model)
##
## Call:
## lm(formula = heightIn ~ ageYear, data = heightweight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.3517 -1.9006 0.1378 1.9071 8.3371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.4356 1.8281 20.48 <2e-16 ***
## ageYear 1.7483 0.1329 13.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.989 on 234 degrees of freedom
## Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
## F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
# First generate prediction data
pred <- predictvals( model, "ageYear", "heightIn")
sp <- ggplot( heightweight, aes( x = ageYear, y = heightIn)) +
geom_point() + geom_line( data = pred)
##
sp +
annotate("text", label ="r^2 = 0.42", x = 16.5, y = 52)
##
sp +
annotate("text", label ="r^2 == 0.42", parse = TRUE, x = 16.5, y = 52)
## It’s possible to automatically extract values from the model object and build an expression
eqn <- as.character(
as.expression( substitute( italic( y) == a + b * italic( x) * "," ~~ italic(r)^2 ~ "=" ~ r2,
list( a = format( coef( model)[1], digits = 3),
b = format( coef( model)[2], digits = 3),
r2 = format( summary(model)$r.squared,
digits = 2) ))))
eqn
## [1] "italic(y) == \"37.4\" + \"1.75\" * italic(x) * \",\" ~ ~italic(r)^2 ~ \"=\" ~ \"0.42\""
# Parsing turns it into an expression
parse( text = eqn)
## expression(italic(y) == "37.4" + "1.75" * italic(x) * "," ~ ~italic(r)^2 ~
## "=" ~ "0.42")
# Now that we have the expression string, we can add it to the plot.
sp + annotate("text", label = eqn, parse = TRUE, x = Inf, y =-Inf, hjust = 1.1, vjust =-.5)
ggplot( faithful, aes( x = eruptions, y = waiting)) +
geom_point() +
geom_rug()
# To reduce the overplotting, we can jitter the line positions and make them slightly thinner by specifying size
ggplot( faithful, aes( x = eruptions, y = waiting)) +
geom_point() +
geom_rug( position ="jitter", size =.2)
# we’ll just take the subset of countries that spent more than $ 2000 USD per capita:
subset( countries, Year == 2009 & healthexp > 2000)
## Name Code Year GDP laborrate healthexp infmortality
## 254 Andorra AND 2009 NA NA 3089.636 3.1
## 560 Australia AUS 2009 42130.82 65.2 3867.429 4.2
## 611 Austria AUT 2009 45555.43 60.4 5037.311 3.6
## 968 Belgium BEL 2009 43640.20 53.5 5104.019 3.6
## 1733 Canada CAN 2009 39599.04 67.8 4379.761 5.2
## 2702 Denmark DNK 2009 55933.35 65.4 6272.729 3.4
## 3365 Finland FIN 2009 44576.73 60.9 4309.604 2.5
## 3416 France FRA 2009 40663.05 56.1 4797.966 3.5
## 3671 Germany DEU 2009 40658.58 59.8 4628.769 3.5
## 3824 Greece GRC 2009 28936.48 53.7 3040.734 3.5
## 4436 Iceland ISL 2009 37972.24 77.5 3130.391 1.7
## 4691 Ireland IRL 2009 49737.93 63.6 4951.845 3.4
## 4844 Italy ITA 2009 35073.32 49.1 3327.630 3.2
## 4946 Japan JPN 2009 39456.44 59.5 3321.466 2.4
## 5864 Luxembourg LUX 2009 106252.24 55.5 8182.855 2.2
## 6680 Monaco MCO 2009 172676.34 NA 7137.390 3.4
## 7088 Netherlands NLD 2009 48068.35 66.1 5163.740 3.8
## 7190 New Zealand NZL 2009 29352.45 68.6 2633.625 4.9
## 7445 Norway NOR 2009 78408.71 66.9 7661.610 2.9
## 7955 Portugal PRT 2009 22029.87 62.5 2409.661 3.2
## 8312 San Marino SMR 2009 NA NA 4089.271 1.9
## 8822 Slovenia SVN 2009 24101.26 58.9 2174.718 2.5
## 9077 Spain ESP 2009 31891.39 58.6 3075.013 4.0
## 9536 Sweden SWE 2009 43406.17 64.8 4251.976 2.4
## 9587 Switzerland CHE 2009 63524.65 66.9 7140.729 4.1
## 10454 United Kingdom GBR 2009 35163.41 62.2 3285.050 4.7
## 10505 United States USA 2009 45744.56 65.0 7410.163 6.6
# To manually add annotations, use annotate(),
sp <- ggplot( subset( countries, Year == 2009 & healthexp > 2000), aes( x = healthexp, y = infmortality)) +
geom_point()
sp +
annotate("text", x = 4350, y = 5.4, label ="Canada") +
annotate("text", x = 7400, y = 6.8, label ="USA")
# To automatically add the labels from your data we use geom_text() and map a column that is a factor or character vector to the label aesthetic.
sp +
geom_text( aes( label = Name), size = 4)
sp +
geom_text( aes( label = Name), size = 4, vjust = 0)
# Add a little extra to y
sp +
geom_text( aes( y = infmortality +.1, label = Name), size = 4, vjust = 0)
sp +
geom_text( aes( label = Name), size = 4, hjust = 0)
sp +
geom_text( aes( x = healthexp + 100, label = Name), size = 4, hjust = 0)
# you can add a new column to your data frame containing just the labels you want.
cdat <- subset( countries, Year == 2009 & healthexp > 2000)
cdat$Name1 <- cdat$Name
idx <- cdat$Name1 %in% c(" Canada", "Ireland", "United Kingdom", "United States", "New Zealand", "Iceland", "Japan", "Luxembourg", "Netherlands", "Switzerland")
head(idx)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
# Then we’ll use that Boolean vector to overwrite all the other entries in Name1 with NA:
cdat$Name1[!idx] <- NA
# Now we can make the plot
ggplot( cdat, aes( x = healthexp, y = infmortality)) +
geom_point() +
geom_text( aes( x = healthexp + 100, label = Name1), size = 4, hjust = 0) +
xlim( 2000, 10000)
## Warning: Removed 18 rows containing missing values (geom_text).
p <- ggplot( cdat, aes( x = healthexp, y = infmortality, size = GDP)) +
geom_point( shape = 21, colour ="black", fill ="cornsilk")
# GDP mapped to radius (default with scale_size_continuous)
p
## Warning: Removed 2 rows containing missing values (geom_point).
# GDP mapped to area instead, and larger circles
p + scale_size_area( max_size = 15)
## Warning: Removed 2 rows containing missing values (geom_point).
# Add up counts for male and female
hec <- HairEyeColor[,,"Male"] + HairEyeColor[,,"Female"]
# Convert to long format
library( reshape2)
hec <- melt( hec, value.name ="count")
ggplot( hec, aes( x = Eye, y = Hair)) +
geom_point( aes( size = count), shape = 21, colour ="black", fill ="cornsilk") +
scale_size_area( max_size = 20, guide = FALSE) +
geom_text( aes( y = as.numeric(Hair)-sqrt(count)/ 22, label = count), vjust = 1, colour ="grey60", size = 4)
c2009 <- subset( countries, Year == 2009, select = c( Name, GDP, laborrate, healthexp, infmortality))
head(c2009)
## Name GDP laborrate healthexp infmortality
## 50 Afghanistan NA 59.8 50.88597 103.2
## 101 Albania 3772.605 59.5 264.60406 17.2
## 152 Algeria 4022.199 58.5 267.94653 32.0
## 203 American Samoa NA NA NA NA
## 254 Andorra NA NA 3089.63589 3.1
## 305 Angola 4068.576 81.3 203.80787 99.9
# make the scatter plot matrix
pairs( c2009[, 2: 5])
panel.cor <- function( x, y, digits = 2, prefix ="", cex.cor, ...) {
usr <- par("usr")
on.exit( par(usr))
par( usr = c( 0, 1, 0, 1))
r <- abs( cor( x, y, use ="complete.obs"))
txt <- format( c( r, 0.123456789), digits = digits)[1]
txt <- paste( prefix, txt, sep ="")
if( missing( cex.cor)) cex.cor <- 0.8 / strwidth(txt)
text( 0.5, 0.5, txt, cex = cex.cor * (1 + r) / 2)
}
panel.hist <- function( x, ...) {
usr <- par("usr")
on.exit( par(usr))
par( usr = c( usr[1:2], 0, 1.5) )
h <- hist( x, plot = FALSE)
breaks <- h$breaks
nB <- length(breaks)
y <- h$counts
y <- y/max( y)
rect( breaks[-nB], 0, breaks[-1], y, col ="white", ...)
}
pairs( c2009[,2:5], upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.smooth)
# It may be more desirable to use linear regression lines instead of LOWESS lines.
panel.lm <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), cex = 1, col.smooth = "black", ...) {
points( x, y, pch = pch, col = col, bg = bg, cex = cex)
abline( stats::lm( y ~ x), col = col.smooth, ...)
}
pairs( c2009[, 2:5], pch =".",
upper.panel = panel.cor,
diag.panel = panel.hist,
lower.panel = panel.lm)